Introduction

The goal of this project is to wrangle data using techniques we learned throughout the class: plotting, reformatting variables, cleaning the data, creating new features, maps, data visualization, web scraping, etc. The dataset used in this project is the NYC taxi trip duration data from Kaggle (https://www.kaggle.com/c/nyc-taxi-trip-duration). The competition's goal is to predict taxi trip duration in NYC, but in this project I did not do the prediction part, just the data manipulation. Thus, I will mostly use the training set, which contains pickup/dropoff datetimes and coordinates, passenger counts, two taxi companies, and trip durations.

# ---- Packages ----
# Plotting and general visualisation
library('ggplot2')
library('scales') 
library('grid') 
library('RColorBrewer')
library('corrplot')
library('alluvial') 
# Data manipulation
library('dplyr') 
library('readr') 
library('data.table') 
library('tibble')
library('tidyr') 
library('stringr') 
library('forcats') 
# Dates and geodesic distances
library('lubridate')
library('geosphere')
# Interactive and static maps
library('leaflet')
library('leaflet.extras')
library('maps') 
## Warning: package 'maps' was built under R version 3.4.4
# Modelling packages (loaded but not used in this document)
library('xgboost')
library('caret')
## Warning: package 'caret' was built under R version 3.4.4
library('ggthemes')
## Warning: package 'ggthemes' was built under R version 3.4.4
library('knitr')
library('scales')        # NOTE(review): duplicate of the earlier library('scales')
library('ggmap')
library('RColorBrewer')  # NOTE(review): duplicate of the earlier library('RColorBrewer')
library('treemap')
# gis packages
library('sp')
library('rgeos')
library('geosphere')     # NOTE(review): duplicate of the earlier library('geosphere')

Data loading

# Read the Kaggle train/test CSVs with data.table::fread (fast) and convert
# to tibbles for tidyverse-friendly printing and subsetting.
# FIX: as.tibble() is deprecated; as_tibble() is the supported spelling.
train <- as_tibble(fread('train.csv'))
test <- as_tibble(fread('test.csv'))
# Stack both sets for joint exploration; `dset` tags each row's origin.
# The test set lacks dropoff_datetime and trip_duration, so they are filled
# with NA to make the columns line up for bind_rows().
full <- bind_rows(train %>% mutate(dset = "train"), 
                     test %>% mutate(dset = "test",
                                     dropoff_datetime = NA,
                                     trip_duration = NA))

Data Exploration

# Five-number summaries of every column. Note coordinate outliers far
# outside NYC (e.g. longitude -121.93 / -61.34) and a maximum trip_duration
# of 3,526,282 s (~980 h) — both are dealt with in the cleaning step below.
summary(train)
##       id              vendor_id     pickup_datetime    dropoff_datetime  
##  Length:1458644     Min.   :1.000   Length:1458644     Length:1458644    
##  Class :character   1st Qu.:1.000   Class :character   Class :character  
##  Mode  :character   Median :2.000   Mode  :character   Mode  :character  
##                     Mean   :1.535                                        
##                     3rd Qu.:2.000                                        
##                     Max.   :2.000                                        
##  passenger_count pickup_longitude  pickup_latitude dropoff_longitude
##  Min.   :0.000   Min.   :-121.93   Min.   :34.36   Min.   :-121.93  
##  1st Qu.:1.000   1st Qu.: -73.99   1st Qu.:40.74   1st Qu.: -73.99  
##  Median :1.000   Median : -73.98   Median :40.75   Median : -73.98  
##  Mean   :1.665   Mean   : -73.97   Mean   :40.75   Mean   : -73.97  
##  3rd Qu.:2.000   3rd Qu.: -73.97   3rd Qu.:40.77   3rd Qu.: -73.96  
##  Max.   :9.000   Max.   : -61.34   Max.   :51.88   Max.   : -61.34  
##  dropoff_latitude store_and_fwd_flag trip_duration    
##  Min.   :32.18    Length:1458644     Min.   :      1  
##  1st Qu.:40.74    Class :character   1st Qu.:    397  
##  Median :40.75    Mode  :character   Median :    662  
##  Mean   :40.75                       Mean   :    959  
##  3rd Qu.:40.77                       3rd Qu.:   1075  
##  Max.   :43.92                       Max.   :3526282
# Column types and sample values for the 1,458,644-row training set.
# Datetimes arrive as character columns and are parsed later with lubridate.
glimpse(train)
## Observations: 1,458,644
## Variables: 11
## $ id                 <chr> "id2875421", "id2377394", "id3858529", "id3...
## $ vendor_id          <int> 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2...
## $ pickup_datetime    <chr> "2016-03-14 17:24:55", "2016-06-12 00:43:35...
## $ dropoff_datetime   <chr> "2016-03-14 17:32:30", "2016-06-12 00:54:38...
## $ passenger_count    <int> 1, 1, 1, 1, 1, 6, 4, 1, 1, 1, 1, 4, 2, 1, 1...
## $ pickup_longitude   <dbl> -73.98215, -73.98042, -73.97903, -74.01004,...
## $ pickup_latitude    <dbl> 40.76794, 40.73856, 40.76394, 40.71997, 40....
## $ dropoff_longitude  <dbl> -73.96463, -73.99948, -74.00533, -74.01227,...
## $ dropoff_latitude   <dbl> 40.76560, 40.73115, 40.71009, 40.70672, 40....
## $ store_and_fwd_flag <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N"...
## $ trip_duration      <int> 455, 663, 2124, 429, 435, 443, 341, 1551, 2...
# Same for the test set: two fewer columns, since the target trip_duration
# (and hence dropoff_datetime) is withheld by the competition.
glimpse(test)
## Observations: 625,134
## Variables: 9
## $ id                 <chr> "id3004672", "id3505355", "id1217141", "id2...
## $ vendor_id          <int> 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1...
## $ pickup_datetime    <chr> "2016-06-30 23:59:58", "2016-06-30 23:59:53...
## $ passenger_count    <int> 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 4, 1, 1, 1, 1...
## $ pickup_longitude   <dbl> -73.98813, -73.96420, -73.99744, -73.95607,...
## $ pickup_latitude    <dbl> 40.73203, 40.67999, 40.73758, 40.77190, 40....
## $ dropoff_longitude  <dbl> -73.99017, -73.95981, -73.98616, -73.98643,...
## $ dropoff_latitude   <dbl> 40.75668, 40.65540, 40.72952, 40.73047, 40....
## $ store_and_fwd_flag <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N"...

Features summary

individual feature analysis

# Distribution of trip duration on a log x-axis; the square-root y-axis keeps
# the long right tail visible. Dashed red lines mark the outlier cut-offs
# applied later during cleaning (10 seconds and 22 hours).
# (An earlier attempt to shade the outlier regions with geom_rect was dropped:
#  rect <- data.frame(xmin=1e+04, xmax=1e+05, ymin=-Inf, ymax=Inf))
train %>%
  ggplot(aes(x = trip_duration)) +
  geom_histogram(bins = 100, color = "green") +
  geom_vline(xintercept = 10, linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = 22 * 3600, linetype = "dashed", color = "red", size = 1) +
  scale_x_log10() +
  scale_y_sqrt()

# Reset the base-graphics layout (only affects base plots such as the boxplot
# below; ggplot output ignores par()).
par(mfrow = c(1, 1))
# Distribution of passenger count per trip.
ggplot(data = train, mapping = aes(x = passenger_count)) +
  geom_bar(fill = "blue")

# Exact counts: solo riders dominate; 0 and 7-9 passengers are rare anomalies.
table(train$passenger_count)
## 
##       0       1       2       3       4       5       6       7       8 
##      60 1033540  210318   59896   28404   78088   48333       3       1 
##       9 
##       1
# Boxplot with mean (magenta) and median (red) reference lines.
# FIX: in the original, `col = "red", lwd = 2` were accidentally placed
# inside the median() call instead of abline(), so the median line was drawn
# with default styling and median() received bogus arguments.
boxplot(train$passenger_count, col = "pink", ylab = "passenger_count",
        main = "passenger_count,mean(magenta),median(red)")
abline(h = mean(train$passenger_count), col = "magenta")
abline(h = median(train$passenger_count), col = "red", lwd = 2)

# Pickup counts for the two taxi vendors (vendor_id 1 and 2).
ggplot(data = train, mapping =aes(x = vendor_id ) )+
  geom_bar()

# Exact counts: vendor 2 has roughly 100k more trips than vendor 1.
table(train$vendor_id)
## 
##      1      2 
## 678342 780302
# store_and_fwd_flag — presumably "Y" means the record was stored in the
# vehicle and forwarded later (per the Kaggle data description — TODO confirm);
# "N" is the overwhelming majority.
ggplot(data = train, mapping =aes(x = store_and_fwd_flag ) )+
  geom_bar()

# Exact counts: only ~0.55% of trips are flagged "Y".
table(train$store_and_fwd_flag)
## 
##       N       Y 
## 1450599    8045
# Parse the character datetime columns into POSIXct with lubridate.
# NOTE(review): ymd_hms() defaults to UTC; the timestamps are NYC local clock
# times stored as strings, so clock-time-based features (hour, wday) are
# unaffected, but be careful before any timezone arithmetic.
train$pickup_datetime = ymd_hms(train$pickup_datetime)
train$dropoff_datetime = ymd_hms(train$dropoff_datetime)
# Derive calendar features from the pickup time. label = TRUE gives ordered
# factors ("Jan", "Mon", ...) rather than integers.
train$month<- lubridate::month(train$pickup_datetime, label = TRUE) 
train$wday <- lubridate::wday (train$pickup_datetime, label = TRUE)
train$hour <- lubridate::hour(train$pickup_datetime)
# NOTE(review): `date` here is the day-of-month (1-31), not a Date object —
# a clearer name would be `mday`.
train$date <- lubridate::day(train$pickup_datetime)
train$minutes <-lubridate:: minute(train$pickup_datetime)
# Pickup volume over the full six-month span (Jan-Jun 2016).
ggplot(data = train, mapping =aes(x = pickup_datetime ) )+
   geom_histogram(fill = "blue", bins = 120)

# Dropoff volume — visually near-identical to pickups.
ggplot(data = train, mapping =aes(x = dropoff_datetime ) )+
geom_histogram(fill = "blue", bins = 120)

# Pickup counts aggregated by month.
ggplot(data = train, mapping =aes(x = month ) )+
  geom_bar(fill = "green")

# Pickup counts aggregated by day of the week.
ggplot(data = train, mapping =aes(x = wday ) )+
  geom_bar(fill = "green")

# Pickup counts aggregated by day of the month.
ggplot(data = train, mapping =aes(x = date ) )+
  geom_bar(fill = "blue")

# Pickup counts aggregated by hour of the day.
ggplot(data = train, mapping =aes(x = hour ) )+
  geom_bar(fill = "blue")

- the pickup and dropoff distributions are nearly identical, since most trips likely start and end on the same day.

# Longitude/latitude scatter of pickups, windowed to a rough NYC bounding box.
min_lat <- 40.6
max_lat <- 40.9
min_long <- -74.05
max_long <- -73.7

# White jittered points on a dark theme make Manhattan's street grid visible.
ggplot(train, aes(x=pickup_longitude, y=pickup_latitude)) +
geom_jitter(size=0.6, color = "white") +
scale_x_continuous(limits=c(min_long, max_long)) +
scale_y_continuous(limits=c(min_lat, max_lat))+
theme_dark()

# Trips with 5 or more passengers (filter is passenger_count > 4),
# coloured by vendor on a log gradient.
ggplot(train %>% filter( passenger_count>4), aes(x=pickup_longitude,y=pickup_latitude, color=vendor_id)) +
  geom_jitter(size=0.06) +
  scale_x_continuous(limits=c(min_long,max_long)) +
  scale_y_continuous(limits=c(min_lat,max_lat)) +
  theme_dark() +
  scale_color_gradient(low="#CCCCCC", high="#8E44AD", trans="log") +
  labs(title = "Map of NYC") +
  theme(legend.position="none") +
  coord_equal()

# Identify implausible data points: pickups outside the New York State
# bounding box entirely.

# State and county polygons from the maps package, restricted to New York.
state <- map_data("state")
nyc <- state %>% filter(region=="new york")
county <- map_data("county")
nyc_county <- county %>% filter(region=="new york")

# Combined pickup coordinates from both sets, tagged by origin.
# NOTE(review): `set` is built but never used in the rest of this file —
# candidate for removal unless a later (unseen) chunk relies on it.
set <- rbind(
  train %>% dplyr::select(pickup_longitude, pickup_latitude) %>% mutate(set="tr"),
  test %>% dplyr::select(pickup_longitude, pickup_latitude) %>% mutate(set="te"))


# Some coordinates fall outside New York State (and even the United States).

# Training rows whose pickup falls outside the NY State long/lat extremes.
out <- train %>%
  filter((pickup_longitude > max(nyc$long) | pickup_longitude < min(nyc$long)) | (pickup_latitude > max(nyc$lat) | pickup_latitude < min(nyc$lat)))

# Inspect those outliers on an interactive leaflet map, one marker per trip id.
leaflet(data=out, width="100%") %>%
  addTiles() %>%
  addMarkers(~pickup_longitude, ~pickup_latitude, popup=as.character(out$id))

feature relation

# Vendor id compared with the time-based features.

# Monthly pickup counts, side-by-side bars per vendor.
ggplot(train, aes(x = month, fill = as.factor(vendor_id))) +
  geom_bar(position = "dodge") +
  labs(y = "count", x = "Months")

# Day-of-week pickup counts, side-by-side bars per vendor.
ggplot(train, aes(x = wday, fill = as.factor(vendor_id))) +
  geom_bar(position = "dodge") +
  labs(y = "Total number of pickups", x = "Days of the week")

# Hourly pickup counts; default stacked bars, coloured by vendor.
ggplot(train, aes(x = hour, fill = as.factor(vendor_id))) +
  geom_bar() +
  labs(y = "Total number of pickups", x = "Hours")

# Trip-duration density on a log axis, stacked by vendor.
ggplot(train, aes(trip_duration, fill = as.factor(vendor_id))) +
  geom_density(position = "stack") +
  scale_x_log10()

# Hourly pickup profile, faceted and filled by month.
# FIX: the original passed labels = levels(month), where `month` resolves to
# the lubridate *function* (not the train$month column) and evaluates to NULL.
# The default legend labels are the factor levels of train$month already, so
# the labels argument is simply dropped.
train %>%
ggplot(aes(x = hour,fill = month))+
  geom_bar(position = "dodge")+
  facet_grid(~month, scales = "free")+
  scale_fill_manual(values=c("red","blue","green","brown","pink","magenta"))+ 
  labs(y = "Total number of pickups", x = "Hours")

# Hourly pickup profile, faceted by day of the week.
# FIX: the original mapped `color` (bar outlines) in aes() but then supplied
# scale_fill_manual(), so the custom palette and labels were silently ignored.
# Mapping `fill` instead makes the manual scale take effect.
train %>%
ggplot(aes(x = hour,fill= wday))+
  geom_bar(position = "dodge")+
  facet_grid(~wday, scales = "free")+
  scale_fill_manual(labels= c("sun","mon","tue","wed","Thur","fri","sat"),values=c("red","blue","green","brown","pink","magenta","yellow"))+ 
  labs(y = "Total number of pickups", x = "Hours")

# Hours of a day vs minutes within the hour: checks whether pickups cluster
# at particular minute marks within each hour.
train %>%
ggplot(aes(x = minutes,fill=as.factor(hour)))+
  geom_bar(position = "dodge")+
  facet_grid(~ hour, scales = "free")+
  labs(y = "Total number of pickups", x = "Minutes")

build new features

# New feature: straight-line (great-circle) distance of each trip in metres,
# computed with geosphere::distCosine from pickup to dropoff coordinates.
pick_coord <- train %>%
  dplyr::select(pickup_longitude, pickup_latitude)
drop_coord <- train %>%
  dplyr::select(dropoff_longitude, dropoff_latitude)
train <- train %>%
  mutate(dist = distCosine(pick_coord, drop_coord))


# Distance vs duration on log-log axes. Zero-distance trips produce the
# infinite-values warning below (log10(0) = -Inf) and are dropped from the plot.
train %>%
  ggplot(aes(dist, trip_duration)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Direct distance [m]", y = "Trip duration [s]")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis

# The same relationship after trimming extremes: keep trips lasting between
# 2 minutes and 1 hour over distances between 100 m and 100 km, and switch to
# 2-D binning so the dense core is readable.
train %>%
  filter(trip_duration > 120,
         trip_duration < 3600,
         dist > 100,
         dist < 100e3) %>%
  ggplot(aes(dist, trip_duration)) +
  geom_bin2d(bins = c(500, 500)) +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Direct distance [m]", y = "Trip duration [s]")

# New feature: average trip speed. dist is in metres and trip_duration in
# seconds, so * 3.6 converts m/s to km/h.
train <- train %>%
  mutate(speed = dist/trip_duration*3.6)
# Speed distribution, restricted to a plausible 2-100 km/h window.
train %>%
  filter(speed > 2 & speed < 1e2) %>%
  ggplot(aes(speed)) +
  geom_histogram(fill = "red", bins = 50) +
  labs(x = "Average speed ")

# Median speed per weekday, split by vendor (vendor_id is numeric here, so
# the colour legend is a gradient rather than two discrete colours).
train %>%
  group_by(wday, vendor_id) %>%
  summarise(median_speed = median(speed)) %>%
  ggplot(aes(wday, median_speed, color = vendor_id)) +
  geom_point(size = 4) +
  labs(x = "Day of the week", y = "Median speed [km/h]")

# Median speed per hour of day, with a loess smoother over the points.
train %>%
  group_by(hour, vendor_id) %>%
  summarise(median_speed = median(speed)) %>%
  ggplot(aes(hour, median_speed, color = vendor_id)) +
  geom_smooth(method = "loess", span = 1/2) +
  geom_point(size = 4) +
  labs(x = "Hour of the day", y = "Median speed [km/h]") +
  theme(legend.position = "none")

# Heatmap of median speed over the weekday x hour grid.
train %>%
  group_by(wday, hour) %>%
  summarise(median_speed = median(speed)) %>%
  ggplot(aes(hour, wday, fill = median_speed)) +
  geom_tile() +
  labs(x = "Hour of the day", y = "Day of the week") +
  scale_fill_distiller(palette = "Spectral")

# New feature: initial compass bearing of each trip (geosphere::bearing,
# degrees in [-180, 180] relative to north).
train$bearing = bearing(pick_coord, drop_coord)

# Bearing vs distance in polar coordinates; the log scale drops zero-distance
# trips, producing the warnings below.
train %>%
  filter(dist < 1e5) %>%
  ggplot(aes(bearing, dist)) +
  geom_bin2d(bins = c(100,100)) +
  labs(x = "Bearing", y = "Direct distance") +
  scale_y_log10() +
  theme(legend.position = "none") +
  coord_polar() +
  scale_x_continuous(breaks = seq(-180, 180, by = 45))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 5897 rows containing non-finite values (stat_bin2d).

# Bearing vs trip duration (log scale), same polar layout.
train %>%
  filter(trip_duration < 3600*22) %>%
  filter(dist < 1e5) %>%
  ggplot(aes(bearing, trip_duration)) +
  geom_bin2d(bins = c(100,100)) +
  scale_y_log10() +
  labs(x = "Bearing", y = "Trip duration") +
  coord_polar() +
  scale_x_continuous(breaks = seq(-180, 180, by = 45))

# Bearing vs average speed, capped at 75 km/h.
train %>%
  filter(speed < 75 & dist < 1e5) %>%
  ggplot(aes(bearing, speed)) +
  geom_bin2d(bins = c(100,100)) +
  labs(x = "Bearing", y = "Speed") +
  coord_polar() +
  scale_x_continuous(breaks = seq(-180, 180, by = 45))

DATA CLEANING

# Remove implausible rows from the training set:
# - pickups outside the NY State bounding box (ids collected in `out` above)
# - weird trips with duration < 10 sec or > 22 hours
# - near-zero-distance trips, unless their duration is also tiny (< 60 s),
#   in which case they look like genuine immediate cancellations and are kept
# - trips with impossible average speeds (>= 100 km/h)

train <- train %>%
  filter(!id %in% out$id,
         trip_duration < 22*3600,
         trip_duration > 10,
         dist > 0 | (near(dist, 0) & trip_duration < 60),
         speed < 100)

external datasets

# External dataset: daily Central Park weather observations for 2016.
weather <- as.tibble(fread("weather_data_nyc_centralpark_2016.csv"))
# Reformat and clean: parse the day-month-year date, and replace the
# "T" (trace amount) code in the precipitation/snow columns with a small
# numeric value (0.01) so the columns can be treated as numbers.
weather <- weather %>%
  mutate(date1 = dmy(date),
         rain = as.numeric(ifelse(precipitation == "T", "0.01", precipitation)),
         s_fall = as.numeric(ifelse(`snow fall` == "T", "0.01", `snow fall`)),
         s_depth = as.numeric(ifelse(`snow depth` == "T", "0.01", `snow depth`)),
         all_precip = s_fall + rain,
         has_snow = (s_fall > 0) | (s_depth > 0),
         has_rain = rain > 0,
         max_temp = `maximum temperature`,
         min_temp = `minimum temperature`)
# Keep only the derived columns needed for joining onto the trip data.
add_Info <- weather %>%
  select(date1, rain, s_fall, all_precip, has_snow, has_rain, s_depth, max_temp, min_temp)
# Pickup calendar date is the join key (one weather row per day).
train <- train %>%
  mutate(date1 = date(pickup_datetime))

train <- left_join(train, add_Info, by = "date1")


# Daily snowfall over the study period (mean per day collapses the repeated
# per-trip weather values back to the single daily observation).
train %>%
  group_by(date1) %>%
  summarise(trips = n(),
            snow_fall = mean(s_fall),
            rain_fall = mean(rain),
            all_precip = mean(all_precip)) %>%
  ggplot(aes(date1, snow_fall)) +
  geom_line(color = "blue", size = 1.5) +
  labs(x = "", y = "Snowfall") +
  scale_y_sqrt() +
  scale_x_date(limits = ymd(c("2015-12-28", "2016-06-30")))

# Daily snow depth — shows the late-January 2016 blizzard lingering on
# the ground for days.
train %>%
  group_by(date1) %>%
  summarise(trips = n(),
            snow_depth = mean(s_depth)) %>%
  ggplot(aes(date1, snow_depth)) +
  geom_line(color = "purple", size = 1.5) +
  labs(x = "", y = "Snow depth") +
  scale_y_sqrt() +
  scale_x_date(limits = ymd(c("2015-12-29", "2016-06-30")))

# Pickup volume over time again (post-cleaning), for comparison with the
# weather series above.
ggplot(data = train, mapping =aes(x = pickup_datetime ) )+
   geom_histogram(fill = "blue", bins = 120)

# Median daily speed over time — dips should line up with snow days.
train %>%
  group_by(date1) %>%
  summarise(median_speed = median(speed)) %>%
  ggplot(aes(date1, median_speed)) +
  geom_line(color = "orange", size = 1.5) +
  labs(x = "Date", y = "Median speed")

Future Study

I was going to import external data on NYC neighborhoods from Zillow. The file format is shapefile. I spent a lot of time but could not read it into R. The other challenge is web scraping. It is simple to get a table from an HTML source page, but if the website uses anti-scraping techniques then it requires another level of coding. This interested me a lot. In future study, I plan to develop web-crawling skills. I think it is an important skill for Data Scientists, since everything we do is based on data. Mastering web crawling would let us get most of the data we want. Also, there are many job positions specifically for web crawling.